library(stm)
library(stringr)
library(gdata)
library(extrafont)
library(igraph)

#Topic model analysis
setwd("~/Dropbox/LiebmanRobertsStern/MassDigitizationReplication/TopicModelAnalysis")
load("TopicOutput.RData")

#Get topic labels, coded by a team of research assistants
topiclabs <- read.csv("TopicLabsforRCode.csv")
topiclabs$Short <- sapply(topiclabs$Short, str_trim)
topiclabs$Category <- sapply(topiclabs$Category, str_trim)
table(topiclabs$Short)


###########################################################
#Proportion of topic by each individual category (Table 2)
###########################################################

#Benefits, Labor and Compensation
mean(apply(stm.out$theta[,which(topiclabs$Category=="Benefits, Labor and Compensation")],1,sum))
#Benefits, Labor and Compensation and Land
mean(apply(stm.out$theta[,c(which(topiclabs$Category=="Benefits, Labor and Compensation; Land"),
                            which(topiclabs$Category=="Land; Benefits, Labor and Compensation"))],1,sum))
#Fines/Punishment
mean(apply(stm.out$theta[,which(topiclabs$Category=="Fines/punishment")],1,sum))
#Fines/punishment; Benefits, Labor and Compensation
mean(stm.out$theta[,which(topiclabs$Category=="Fines/punishment; Benefits, Labor and Compensation")])

#General
mean(apply(stm.out$theta[,which(topiclabs$Category=="General")],1,sum))

#Land
mean(apply(stm.out$theta[,which(topiclabs$Category=="Land")],1,sum))

#Procedure
mean(apply(stm.out$theta[,which(topiclabs$Category=="Procedure")],1,sum))

#Permits and Registration
mean(apply(stm.out$theta[,which(topiclabs$Category=="Permits and Registration (non-land)")],1,sum))

#Procedure + land
mean(apply(stm.out$theta[,c(which(topiclabs$Category=="Procedure; land"),which(topiclabs$Category=="Land; procedure"),
                            which(topiclabs$Category=="Procedure; Land"))],1,sum))

#Land + fines/punishment
mean(stm.out$theta[,which(topiclabs$Category=="Land; fines/punishment")])

#How many of the topics are related to third party disputes?
table(topiclabs$thirdparty)

###########################################################
#Topic Model Correlation Plot, B/W
###########################################################
#GGplot2 correlation plot
#install.packages("GGally")
library(GGally)

#devtools::install_github("briatte/ggnet")
library(ggnet)
library(network)
library(sna)
library(ggplot2)
mod.out.corr <- topicCorr(stm.out, cutoff=.01)
#Take out unconnected nodes for visualization
which(apply(mod.out.corr$posadj,1, sum)==1)
adj <- mod.out.corr$posadj[apply(mod.out.corr$posadj,1, sum)!=1,apply(mod.out.corr$posadj,1, sum)!=1]
net = network(adj, directed = FALSE)
vlabels <-  paste(1:50,": ",as.character(topiclabs$Topic.Label[apply(mod.out.corr$posadj,1, sum)!=1]), sep="")

topiclab <- topiclabs$Short[apply(mod.out.corr$posadj,1, sum)!=1]
net %v% "Category" = ifelse(topiclab=="Benefits, Labor and Compensation", "Benefits, Labor, and Compensation",
                            ifelse(topiclab=="Fines/punishment", "Fines/punishment",
                                   ifelse(topiclab=="Procedure", "Procedure", 
                                          ifelse(topiclab=="Land", "Land",
                                                 ifelse(topiclab=="Permits and Registration (non-land)", "Permits and Registration (non-land)", "General")))))
lab.col <- ifelse(net %v% "Category" %in% c("Benefits, Labor, and Compensation", "Procedure", "Permits and Registration (non-land)"), "black", "white")
#pdf("CorrelationPlotGGnet.pdf", width=6, height=4)
tiff("CorrelationPlotGGnet.tiff", width=500, height=400)
ggnet2(net,
       size=apply(stm.out$theta[,topiclabs$Topic.Label[apply(mod.out.corr$posadj,1, sum)!=1]],2,mean)*70, color = "Category", shape = "Category", shape.palette = c("Benefits, Labor, and Compensation"=21, 
                                                                                                 "Fines/punishment"=15,
                                                                                                 "Procedure"=23,
                                                                                                 "Land"=16,
                                                                                                 "Permits and Registration (non-land)"=22,
                                                                                                 "General"=17), 
       palette = c("Benefits, Labor, and Compensation"="black", 
                       "Fines/punishment"="black",
                       "Procedure"="black",
                       "Land"="black",
                       "Permits and Registration (non-land)"="black",
                       "General"="black"), 
       label = c(1:82)[apply(mod.out.corr$posadj,1, sum)!=1], label.size=2,label.color = lab.col, edge.color = "white") + guides(size=FALSE)  +
  theme(panel.background = element_rect(fill = "gray"))
dev.off()

###########################################################
#Third Party Disputes, Random Sample
###########################################################
thirdparty <- read.csv("ThirdPartyCases_RandomSample500.csv")
#Case.ID = Case ID
#code = 0: not third party, 1: third party, 2: could not be determined
#court = Court that case was filed in
#hightransparency = indicates whether this court had high transparency (>.65)
#What proportion of cases were third party?
table(thirdparty$code)
prop.table(table(thirdparty$code))
#What proportion of cases in high transparency courts were third party?
table(thirdparty$code[thirdparty$hightransparency==1])
prop.table(table(thirdparty$code[thirdparty$hightransparency==1]))


